library(tidyverse)
library(skimr)
library(lmtest)
library(car)
#reading the file with tidytuesdayR
tuesdata <- tidytuesdayR::tt_load(2024, week = 10)
## ---- Compiling #TidyTuesday Information for 2024-03-05 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "trashwheel.csv"
trashwheel <- tuesdata$trashwheel
str(trashwheel) # Check the structure
## spc_tbl_ [993 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : chr [1:993] "mister" "mister" "mister" "mister" ...
## $ Name : chr [1:993] "Mister Trash Wheel" "Mister Trash Wheel" "Mister Trash Wheel" "Mister Trash Wheel" ...
## $ Dumpster : num [1:993] 1 2 3 4 5 6 7 8 9 10 ...
## $ Month : chr [1:993] "May" "May" "May" "May" ...
## $ Year : num [1:993] 2014 2014 2014 2014 2014 ...
## $ Date : chr [1:993] "5/16/2014" "5/16/2014" "5/16/2014" "5/17/2014" ...
## $ Weight : num [1:993] 4.31 2.74 3.45 3.1 4.06 2.71 1.91 3.7 2.52 3.76 ...
## $ Volume : num [1:993] 18 13 15 15 18 13 8 16 14 18 ...
## $ PlasticBottles: num [1:993] 1450 1120 2450 2380 980 1430 910 3580 2400 1340 ...
## $ Polystyrene : num [1:993] 1820 1030 3100 2730 870 2140 1090 4310 2790 1730 ...
## $ CigaretteButts: num [1:993] 126000 91000 105000 100000 120000 90000 56000 112000 98000 130000 ...
## $ GlassBottles : num [1:993] 72 42 50 52 72 46 32 58 49 75 ...
## $ PlasticBags : num [1:993] 584 496 1080 896 368 ...
## $ Wrappers : num [1:993] 1162 874 2032 1971 753 ...
## $ SportsBalls : num [1:993] 7 5 6 6 7 5 3 6 6 7 ...
## $ HomesPowered : num [1:993] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_character(),
## .. Name = col_character(),
## .. Dumpster = col_double(),
## .. Month = col_character(),
## .. Year = col_double(),
## .. Date = col_character(),
## .. Weight = col_double(),
## .. Volume = col_double(),
## .. PlasticBottles = col_double(),
## .. Polystyrene = col_double(),
## .. CigaretteButts = col_double(),
## .. GlassBottles = col_double(),
## .. PlasticBags = col_double(),
## .. Wrappers = col_double(),
## .. SportsBalls = col_double(),
## .. HomesPowered = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
dim(trashwheel) # Dimensions
## [1] 993 16
head(trashwheel) # Preview first few rows
## # A tibble: 6 × 16
## ID Name Dumpster Month Year Date Weight Volume PlasticBottles
## <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 mister Mister Trash W… 1 May 2014 5/16… 4.31 18 1450
## 2 mister Mister Trash W… 2 May 2014 5/16… 2.74 13 1120
## 3 mister Mister Trash W… 3 May 2014 5/16… 3.45 15 2450
## 4 mister Mister Trash W… 4 May 2014 5/17… 3.1 15 2380
## 5 mister Mister Trash W… 5 May 2014 5/17… 4.06 18 980
## 6 mister Mister Trash W… 6 May 2014 5/20… 2.71 13 1430
## # ℹ 7 more variables: Polystyrene <dbl>, CigaretteButts <dbl>,
## # GlassBottles <dbl>, PlasticBags <dbl>, Wrappers <dbl>, SportsBalls <dbl>,
## # HomesPowered <dbl>
colnames(trashwheel) # List column names
## [1] "ID" "Name" "Dumpster" "Month"
## [5] "Year" "Date" "Weight" "Volume"
## [9] "PlasticBottles" "Polystyrene" "CigaretteButts" "GlassBottles"
## [13] "PlasticBags" "Wrappers" "SportsBalls" "HomesPowered"
sapply(trashwheel, class) # Check data types
## ID Name Dumpster Month Year
## "character" "character" "numeric" "character" "numeric"
## Date Weight Volume PlasticBottles Polystyrene
## "character" "numeric" "numeric" "numeric" "numeric"
## CigaretteButts GlassBottles PlasticBags Wrappers SportsBalls
## "numeric" "numeric" "numeric" "numeric" "numeric"
## HomesPowered
## "numeric"
table(trashwheel$Month) # Checking the month variable values
##
## April August December February January july July June
## 103 99 90 38 41 2 134 121
## March May November October september September
## 52 83 94 52 1 83
# Count missing values per column
colSums(is.na(trashwheel))
## ID Name Dumpster Month Year
## 0 0 0 0 0
## Date Weight Volume PlasticBottles Polystyrene
## 0 0 0 1 1
## CigaretteButts GlassBottles PlasticBags Wrappers SportsBalls
## 1 251 1 144 364
## HomesPowered
## 0
# Impute or remove missing values (example: imputation for numeric columns)
trashwheel <- trashwheel %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
# Remove duplicates
trashwheel <- trashwheel %>%
distinct()
# Convert Year to a factor
trashwheel$Year <- as.factor(trashwheel$Year)
unique(trashwheel$Year)
## [1] 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
## Levels: 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
# Convert the Month column to lowercase so I'll only have 12 types
trashwheel$Month <- tolower(trashwheel$Month)
# Check the unique values to verify there are now only 12 months
unique(trashwheel$Month)
## [1] "may" "june" "july" "august" "september" "october"
## [7] "november" "december" "january" "february" "march" "april"
#making a YearMonth variable
trashwheel$YearMonth <- paste(trashwheel$Year, trashwheel$Month, sep = "-")
#Checking unique values
unique(trashwheel$YearMonth)
## [1] "2014-may" "2014-june" "2014-july" "2014-august"
## [5] "2014-september" "2014-october" "2014-november" "2014-december"
## [9] "2015-january" "2015-february" "2015-march" "2015-april"
## [13] "2015-may" "2015-june" "2015-july" "2015-august"
## [17] "2015-september" "2015-october" "2015-november" "2015-december"
## [21] "2016-january" "2016-february" "2016-march" "2016-april"
## [25] "2016-may" "2016-june" "2016-july" "2016-august"
## [29] "2016-september" "2016-october" "2016-november" "2016-december"
## [33] "2017-january" "2017-february" "2017-march" "2017-april"
## [37] "2017-may" "2017-june" "2017-july" "2017-august"
## [41] "2017-september" "2017-october" "2017-november" "2018-january"
## [45] "2018-february" "2018-march" "2018-april" "2018-may"
## [49] "2018-june" "2018-july" "2018-august" "2018-september"
## [53] "2018-october" "2018-november" "2018-december" "2019-january"
## [57] "2019-february" "2019-march" "2019-april" "2019-may"
## [61] "2019-june" "2019-july" "2019-august" "2019-october"
## [65] "2019-november" "2019-december" "2020-january" "2020-february"
## [69] "2020-march" "2020-april" "2020-may" "2020-june"
## [73] "2020-july" "2020-august" "2020-september" "2020-october"
## [77] "2020-november" "2020-december" "2021-january" "2021-february"
## [81] "2021-march" "2021-april" "2021-may" "2021-june"
## [85] "2021-july" "2021-august" "2021-september" "2021-october"
## [89] "2021-november" "2021-december" "2022-january" "2022-march"
## [93] "2022-february" "2022-april" "2022-may" "2022-june"
## [97] "2022-july" "2022-august" "2022-september" "2022-october"
## [101] "2022-november" "2022-december" "2023-january" "2023-february"
## [105] "2023-march" "2023-april" "2023-may" "2023-june"
## [109] "2023-july" "2023-august" "2023-september" "2023-october"
## [113] "2023-november" "2023-december"
unique(trashwheel$HomesPowered)
## [1] 0 42 64 31 34 57 54 59 33 71 94 58 56 78 53 80 41 48 66 68 51 61 62 50 72
## [26] 55 32 74 60 35 75 47 70 81 69 49 40 43 52 39 67 84 46 29 36 44 45 65 25 37
## [51] 63 27 16 73 77 38 76 85 30 13 24 22 87 28 89 26 20 17 21 23 12 18 10 19 11
## [76] 14
unique(trashwheel$Name)
## [1] "Mister Trash Wheel" "Professor Trash Wheel" "Captain Trash Wheel"
## [4] "Gwynnda Trash Wheel"
# Summary statistics
skim(trashwheel)
| Name | trashwheel |
| Number of rows | 993 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| factor | 1 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ID | 0 | 1 | 6 | 9 | 0 | 4 | 0 |
| Name | 0 | 1 | 18 | 21 | 0 | 4 | 0 |
| Month | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
| Date | 0 | 1 | 6 | 10 | 0 | 623 | 0 |
| YearMonth | 0 | 1 | 8 | 14 | 0 | 114 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Year | 0 | 1 | FALSE | 10 | 202: 165, 202: 161, 202: 128, 201: 123 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Dumpster | 0 | 1 | 230.88 | 185.82 | 1.00 | 73.00 | 176.00 | 381.00 | 629.00 | ▇▅▂▂▂ |
| Weight | 0 | 1 | 2.97 | 0.84 | 0.61 | 2.45 | 3.04 | 3.53 | 5.62 | ▁▅▇▃▁ |
| Volume | 0 | 1 | 14.92 | 1.61 | 5.00 | 15.00 | 15.00 | 15.00 | 20.00 | ▁▁▁▇▁ |
| PlasticBottles | 0 | 1 | 2219.33 | 1649.62 | 0.00 | 990.00 | 1900.00 | 2900.00 | 9830.00 | ▇▆▁▁▁ |
| Polystyrene | 0 | 1 | 1436.87 | 1831.51 | 0.00 | 240.00 | 750.00 | 2130.00 | 11528.00 | ▇▂▁▁▁ |
| CigaretteButts | 0 | 1 | 13728.12 | 24037.49 | 0.00 | 2900.00 | 4900.00 | 12000.00 | 310000.00 | ▇▁▁▁▁ |
| GlassBottles | 0 | 1 | 20.96 | 13.19 | 0.00 | 12.00 | 20.96 | 24.00 | 110.00 | ▇▂▁▁▁ |
| PlasticBags | 0 | 1 | 984.00 | 1411.63 | 0.00 | 240.00 | 540.00 | 1210.00 | 13450.00 | ▇▁▁▁▁ |
| Wrappers | 0 | 1 | 2238.76 | 2508.23 | 0.00 | 960.00 | 1700.00 | 2238.76 | 20100.00 | ▇▁▁▁▁ |
| SportsBalls | 0 | 1 | 13.59 | 7.75 | 0.00 | 8.00 | 13.59 | 14.00 | 56.00 | ▅▇▁▁▁ |
| HomesPowered | 0 | 1 | 45.85 | 18.23 | 0.00 | 38.00 | 49.00 | 58.00 | 94.00 | ▂▂▇▅▁ |
#making a new dataframe with a YearMOnth grouping and averaging volume and wheight
trashwheel_volume_weight <- trashwheel %>%
group_by(YearMonth) %>%
summarise(
mean_volume = mean(Volume, na.rm = TRUE),
mean_weight = mean(Weight, na.rm = TRUE),
.groups = "drop"
)
# Plot Mean Weight over Time
ggplot(trashwheel_volume_weight, aes(x = YearMonth, y = mean_weight, group = 1)) +
geom_line(color = "blue") +
geom_point(color = "blue") +
labs(title = "Mean Weight Over Time", x = "Year-Month", y = "Mean Weight") +
theme_minimal() +
scale_x_discrete(breaks = trashwheel_volume_weight$YearMonth[seq(1, nrow(trashwheel_volume_weight), by = 12)]) # Show every 12th value
#plot Mean Volume over Time
ggplot(trashwheel_volume_weight, aes(x = YearMonth, y = mean_volume, group = 1)) +
geom_line(color = "red") +
geom_point(color = "red") +
labs(title = "Mean Volume Over Time", x = "Year-Month", y = "Mean Volume") +
theme_minimal() +
scale_x_discrete(breaks = trashwheel_volume_weight$YearMonth[seq(1, nrow(trashwheel_volume_weight), by = 12)]) # Show every 12th value
#making another dataframe with Yearmonth and Name grouping and homespowered averaged
trashwheel_homespowered <- trashwheel %>%
group_by(YearMonth, Name) %>%
summarise(
mean_homespowered = mean(as.numeric(HomesPowered), na.rm = TRUE), # Convert HomePowered to numeric if needed
.groups = "drop"
)
#Plot: homespowered over time, with distinct trashwheels
ggplot(trashwheel_homespowered, aes(x = YearMonth, y = mean_homespowered, group = 1)) +
geom_line(color = "blue") +
geom_point(color = "blue") +
labs(
title = "Mean Homes Powered Over Time by Trashwheel",
x = "Year-Month",
y = "Mean Homes Powered"
) +
theme_minimal() +
facet_wrap(~ Name)
# Group by YearMonth and calculate means for specified trash types
trashwheel_summary <- trashwheel %>%
group_by(YearMonth) %>%
summarise(
PlasticBottles = mean(PlasticBottles, na.rm = TRUE),
Polystyrene = mean(Polystyrene, na.rm = TRUE),
CigaretteButts = mean(CigaretteButts, na.rm = TRUE),
GlassBottles = mean(GlassBottles, na.rm = TRUE),
PlasticBags = mean(PlasticBags, na.rm = TRUE),
Wrappers = mean(Wrappers, na.rm = TRUE),
SportsBalls = mean(SportsBalls, na.rm = TRUE),
.groups = "drop"
)
# Reshape the data to long format
trashwheel_long <- trashwheel_summary %>%
pivot_longer(
cols = c(PlasticBottles, Polystyrene, CigaretteButts, GlassBottles, PlasticBags, Wrappers, SportsBalls),
names_to = "TrashType",
values_to = "MeanAmount"
)
# Plot: trends of trash types over time
ggplot(trashwheel_long, aes(x = YearMonth, y = MeanAmount, color = TrashType, group = TrashType)) +
geom_line(size = 1.2) +
scale_y_log10(labels = scales::comma_format()) + # Logarithmic scale for y-axis
scale_x_discrete(breaks = trashwheel_summary$YearMonth[seq(1, nrow(trashwheel_summary), by = 12)]) +
scale_color_brewer(palette = "Dark2") +
labs(
title = "Trends of Trash Types Over Time (Logarithmic Scale)",
x = "Year-Month",
y = "Mean Amount (Log Scale)",
color = "Trash Type"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1) # Rotate x-axis labels
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
trash_model <- lm(Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
GlassBottles + PlasticBags + Wrappers + SportsBalls,
data = trashwheel)
# View the model summary
summary(trash_model)
##
## Call:
## lm(formula = Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
## GlassBottles + PlasticBags + Wrappers + SportsBalls, data = trashwheel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.41998 -0.47082 0.05192 0.53051 2.32476
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.199e+00 6.141e-02 52.099 < 2e-16 ***
## PlasticBottles -5.825e-05 2.165e-05 -2.691 0.00725 **
## Polystyrene -6.296e-05 2.326e-05 -2.706 0.00692 **
## CigaretteButts 8.303e-06 1.564e-06 5.309 1.36e-07 ***
## GlassBottles -4.902e-03 2.771e-03 -1.769 0.07717 .
## PlasticBags 9.760e-05 2.677e-05 3.646 0.00028 ***
## Wrappers -1.019e-04 1.558e-05 -6.543 9.69e-11 ***
## SportsBalls 7.898e-03 3.613e-03 2.186 0.02906 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7775 on 985 degrees of freedom
## Multiple R-squared: 0.1562, Adjusted R-squared: 0.1502
## F-statistic: 26.06 on 7 and 985 DF, p-value: < 2.2e-16
#Checking all assumptions for the simple model ## Checking Influential Points
# Check for influential points using Cook's distance
cooks_d_trash <- cooks.distance(trash_model)
# Visualize Cook's Distance
cooks_data_trash <- data.frame(Observation = 1:length(cooks_d_trash), Cooks_Distance = cooks_d_trash)
ggplot(cooks_data_trash, aes(x = Observation, y = Cooks_Distance)) +
geom_point() +
geom_hline(yintercept = 4 / length(cooks_d_trash), color = "red", linetype = "dashed") +
labs(title = "Cook's Distance for Influential Observations",
x = "Observation",
y = "Cook's Distance") +
theme_minimal()
# Identify observations with Cook's distance greater than 4/n
influentials_trash <- which(cooks_d_trash > (4 / nrow(trashwheel)))
influentials_trash
## 13 18 19 20 21 26 30 34 36 50 62 67 94 117 129 130 165 272 329 407
## 13 18 19 20 21 26 30 34 36 50 62 67 94 117 129 130 165 272 329 407
## 478 486 518 574 578 633 636 637 638 639 642 644 645 647 648 651 657 658 662 666
## 478 486 518 574 578 633 636 637 638 639 642 644 645 647 648 651 657 658 662 666
## 671 673 680 696 709 716 726 754
## 671 673 680 696 709 716 726 754
###Checking normality of residuals
# Calculate residuals
residuals_trash <- residuals(trash_model)
# Plot histogram of residuals
ggplot(data.frame(residuals_trash = residuals_trash), aes(x = residuals_trash)) +
geom_histogram(binwidth = 0.5, color = "black", fill = "blue", alpha = 0.7) +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
theme_minimal()
# Q-Q Plot for residuals
qqnorm(residuals_trash)
qqline(residuals_trash, col = "red", lwd = 2)
# Get the fitted values
fitted_values_trash <- fitted(trash_model)
# Create a data frame for residuals vs fitted values
residuals_vs_fitted_df_trash <- data.frame(fitted_values_trash, residuals_trash = residuals(trash_model))
# Residuals vs Fitted plot
ggplot(residuals_vs_fitted_df_trash, aes(x = fitted_values_trash, y = residuals_trash)) +
geom_point(color = "skyblue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "Residuals vs Fitted Values", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Scale-Location Plot for Visual Check
plot(trash_model, which = 3)
# Breusch-Pagan Test for Homoscedasticity
bptest_trash <- bptest(trash_model)
bptest_trash
##
## studentized Breusch-Pagan test
##
## data: trash_model
## BP = 16.938, df = 7, p-value = 0.0178
# Calculate and check the VIF for the model
vif_values_trash <- vif(trash_model)
# Print VIF values
print(vif_values_trash)
## PlasticBottles Polystyrene CigaretteButts GlassBottles PlasticBags
## 2.092729 2.978833 2.319319 2.190775 2.342616
## Wrappers SportsBalls
## 2.505624 1.285790
trashwheel$log_Weight <- log(trashwheel$Weight)
# building the logarthimical mdoel
trash_model_log <- lm(log_Weight ~ PlasticBottles + Polystyrene + CigaretteButts + GlassBottles + PlasticBags + Wrappers + SportsBalls, data = trashwheel)
# Check for influential points using Cook's distance
cooks_d_trash_log <- cooks.distance(trash_model_log)
# Visualize Cook's Distance
cooks_data_trash_log <- data.frame(Observation = 1:length(cooks_d_trash_log), Cooks_Distance = cooks_d_trash_log)
ggplot(cooks_data_trash_log, aes(x = Observation, y = Cooks_Distance)) +
geom_point() +
geom_hline(yintercept = 4 / length(cooks_d_trash_log), color = "red", linetype = "dashed") +
labs(title = "Cook's Distance for Influential Observations (Log Model)",
x = "Observation",
y = "Cook's Distance") +
theme_minimal()
# Identify observations with Cook's distance greater than 4/n
influentials_trash_log <- which(cooks_d_trash_log > (4 / nrow(trashwheel)))
influentials_trash_log
## 20 26 30 34 50 62 94 117 130 272 407 632 633 636 637 638 639 640 642 644
## 20 26 30 34 50 62 94 117 130 272 407 632 633 636 637 638 639 640 642 644
## 645 647 648 650 651 654 657 658 662 666 671 673 680 687 696 697 709 710 715 716
## 645 647 648 650 651 654 657 658 662 666 671 673 680 687 696 697 709 710 715 716
## 726 749 750 754 756 759 766 770 774 778 830 952
## 726 749 750 754 756 759 766 770 774 778 830 952
# Calculate residuals
residuals_trash_log <- residuals(trash_model_log)
# Plot histogram of residuals
ggplot(data.frame(residuals_trash_log = residuals_trash_log), aes(x = residuals_trash_log)) +
geom_histogram(binwidth = 0.5, color = "black", fill = "blue", alpha = 0.7) +
labs(title = "Histogram of Residuals (Log Model)", x = "Residuals", y = "Frequency") +
theme_minimal()
# Q-Q Plot for residuals
qqnorm(residuals_trash_log)
qqline(residuals_trash_log, col = "red", lwd = 2)
# Get the fitted values
fitted_values_trash_log <- fitted(trash_model_log)
# Create a data frame for residuals vs fitted values
residuals_vs_fitted_df_trash_log <- data.frame(
fitted_values_trash_log,
residuals_trash_log = residuals(trash_model_log)
)
# Residuals vs Fitted plot
ggplot(residuals_vs_fitted_df_trash_log, aes(x = fitted_values_trash_log, y = residuals_trash_log)) +
geom_point(color = "skyblue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "Residuals vs Fitted Values (Log Model)", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Scale-Location Plot for Visual Check
plot(trash_model_log, which = 3)
# Breusch-Pagan Test for Homoscedasticity
bptest_trash_log <- bptest(trash_model_log)
bptest_trash_log
##
## studentized Breusch-Pagan test
##
## data: trash_model_log
## BP = 10.729, df = 7, p-value = 0.1509
# Calculate and check the VIF for the model
vif_values_trash_log <- vif(trash_model_log)
# Print VIF values
print(vif_values_trash_log)
## PlasticBottles Polystyrene CigaretteButts GlassBottles PlasticBags
## 2.092729 2.978833 2.319319 2.190775 2.342616
## Wrappers SportsBalls
## 2.505624 1.285790
# Create the Season variable based on Month
trashwheel$Season <- case_when(
trashwheel$Month %in% c("december", "january", "february") ~ "Winter",
trashwheel$Month %in% c("march", "april", "may") ~ "Spring",
trashwheel$Month %in% c("june", "july", "august") ~ "Summer",
trashwheel$Month %in% c("september", "october", "november") ~ "Autumn",
TRUE ~ NA_character_
)
# Convert Season to a factor variable
trashwheel$Season <- factor(trashwheel$Season, levels = c("Winter", "Spring", "Summer", "Autumn"))
# Verify the new variable
table(trashwheel$Season)
##
## Winter Spring Summer Autumn
## 169 238 356 230
# Ensure Month is lowercase to avoid mismatches
trashwheel$Month <- tolower(trashwheel$Month)
# Create the Season variable based on Month
trashwheel$Season <- case_when(
trashwheel$Month %in% c("december", "january", "february") ~ "Winter",
trashwheel$Month %in% c("march", "april", "may") ~ "Spring",
trashwheel$Month %in% c("june", "july", "august") ~ "Summer",
trashwheel$Month %in% c("september", "october", "november") ~ "Autumn",
TRUE ~ NA_character_
)
# Convert Season to a factor variable
trashwheel$Season <- factor(trashwheel$Season, levels = c("Winter", "Spring", "Summer", "Autumn"))
# Verify the new variable
table(trashwheel$Season)
##
## Winter Spring Summer Autumn
## 169 238 356 230
# Linear regression model: Predict Weight by also Season
season_model <- lm(Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
GlassBottles + PlasticBags + Wrappers + SportsBalls + Season, data = trashwheel)
# View the model summary
summary(season_model)
##
## Call:
## lm(formula = Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
## GlassBottles + PlasticBags + Wrappers + SportsBalls + Season,
## data = trashwheel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.37195 -0.46717 0.07302 0.51726 2.19564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.025e+00 8.484e-02 35.654 < 2e-16 ***
## PlasticBottles -6.237e-05 2.160e-05 -2.888 0.003966 **
## Polystyrene -5.878e-05 2.334e-05 -2.518 0.011944 *
## CigaretteButts 7.341e-06 1.590e-06 4.617 4.41e-06 ***
## GlassBottles -3.049e-03 2.803e-03 -1.088 0.276948
## PlasticBags 8.754e-05 2.677e-05 3.271 0.001110 **
## Wrappers -9.821e-05 1.554e-05 -6.320 3.95e-10 ***
## SportsBalls 7.631e-03 3.606e-03 2.116 0.034554 *
## SeasonSpring 2.760e-01 7.941e-02 3.475 0.000533 ***
## SeasonSummer 1.880e-01 7.390e-02 2.544 0.011115 *
## SeasonAutumn 1.027e-01 7.889e-02 1.302 0.193088
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7733 on 982 degrees of freedom
## Multiple R-squared: 0.1678, Adjusted R-squared: 0.1593
## F-statistic: 19.8 on 10 and 982 DF, p-value: < 2.2e-16
# Check for influential points using Cook's distance
cooks_d <- cooks.distance(season_model)
# Visualize Cook's Distance
cooks_data <- data.frame(Observation = 1:length(cooks_d), Cooks_Distance = cooks_d)
ggplot(cooks_data, aes(x = Observation, y = Cooks_Distance)) +
geom_point() +
geom_hline(yintercept = 4 / length(cooks_d), color = "red", linetype = "dashed") +
labs(title = "Cook's Distance for Influential Observations",
x = "Observation",
y = "Cook's Distance") +
theme_minimal()
# Identify observations with Cook's distance greater than 4/n
influentials <- which(cooks_d > (4 / nrow(trashwheel)))
influentials
## 13 19 20 26 30 34 50 62 67 94 107 113 117 122 130 272 329 407 486 518
## 13 19 20 26 30 34 50 62 67 94 107 113 117 122 130 272 329 407 486 518
## 574 578 631 633 638 639 644 645 647 651 657 658 662 673 696 709 716 726 749 750
## 574 578 631 633 638 639 644 645 647 651 657 658 662 673 696 709 716 726 749 750
## 754 755 756 770 830
## 754 755 756 770 830
# Calculate residuals
residuals_model <- residuals(season_model)
# Plot histogram
ggplot(data.frame(residuals = residuals_model), aes(x = residuals)) +
geom_histogram(binwidth = 0.5, color = "black", fill = "blue", alpha = 0.7) +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
theme_minimal()
# Get the fitted values
fitted_values <- fitted(season_model)
# Create a data frame for ggplot
residuals_vs_fitted_df <- data.frame(fitted_values, residuals = residuals(season_model))
# Residuals vs Fitted plot
ggplot(residuals_vs_fitted_df, aes(x = fitted_values, y = residuals)) +
geom_point(color = "skyblue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "Residuals vs Fitted Values-ordinal", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
plot(season_model, which = 3)
# Breusch-Pagan Test for homoscedasticity
bptest(season_model)
##
## studentized Breusch-Pagan test
##
## data: season_model
## BP = 26.045, df = 10, p-value = 0.00368
# Calculate and check the VIF for the complex model
vif_values <- vif(season_model)
# Print VIF values
print(vif_values)
## GVIF Df GVIF^(1/(2*Df))
## PlasticBottles 2.105943 1 1.451187
## Polystyrene 3.031032 1 1.740986
## CigaretteButts 2.423324 1 1.556703
## GlassBottles 2.265123 1 1.505033
## PlasticBags 2.367857 1 1.538784
## Wrappers 2.519402 1 1.587262
## SportsBalls 1.294067 1 1.137571
## Season 1.096907 3 1.015535
# List of indices to remove
influential_indices <- c(13, 19, 20, 26, 30, 34, 50, 62, 67, 94, 107, 113, 117, 122, 130,
272, 329, 407, 486, 518, 574, 578, 631, 633, 638, 639, 644, 645,
647, 651, 657, 658, 662, 673, 696, 709, 716, 726, 749, 750, 754,
755, 756, 770, 830)
# Remove these rows from the trashwheel dataset
trashwheel_cleaned <- trashwheel %>%
filter(!row_number() %in% influential_indices)
# Check the first few rows to ensure the removal
head(trashwheel_cleaned)
## # A tibble: 6 × 19
## ID Name Dumpster Month Year Date Weight Volume PlasticBottles
## <chr> <chr> <dbl> <chr> <fct> <chr> <dbl> <dbl> <dbl>
## 1 mister Mister Trash W… 1 may 2014 5/16… 4.31 18 1450
## 2 mister Mister Trash W… 2 may 2014 5/16… 2.74 13 1120
## 3 mister Mister Trash W… 3 may 2014 5/16… 3.45 15 2450
## 4 mister Mister Trash W… 4 may 2014 5/17… 3.1 15 2380
## 5 mister Mister Trash W… 5 may 2014 5/17… 4.06 18 980
## 6 mister Mister Trash W… 6 may 2014 5/20… 2.71 13 1430
## # ℹ 10 more variables: Polystyrene <dbl>, CigaretteButts <dbl>,
## # GlassBottles <dbl>, PlasticBags <dbl>, Wrappers <dbl>, SportsBalls <dbl>,
## # HomesPowered <dbl>, YearMonth <chr>, log_Weight <dbl>, Season <fct>
# building the logarthimical mdoel
season_model_log <- lm(log_Weight ~ PlasticBottles + Polystyrene + CigaretteButts + GlassBottles + PlasticBags + Wrappers + SportsBalls + Season, data = trashwheel)
# summary of the model
summary(season_model_log)
##
## Call:
## lm(formula = log_Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
## GlassBottles + PlasticBags + Wrappers + SportsBalls + Season,
## data = trashwheel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48573 -0.13359 0.06409 0.20567 0.62707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.072e+00 3.446e-02 31.104 < 2e-16 ***
## PlasticBottles -2.097e-05 8.775e-06 -2.389 0.017066 *
## Polystyrene -2.815e-05 9.482e-06 -2.968 0.003067 **
## CigaretteButts 2.793e-06 6.460e-07 4.324 1.69e-05 ***
## GlassBottles -1.137e-03 1.138e-03 -0.999 0.318269
## PlasticBags 4.064e-05 1.087e-05 3.738 0.000196 ***
## Wrappers -4.219e-05 6.312e-06 -6.683 3.91e-11 ***
## SportsBalls 2.803e-03 1.465e-03 1.913 0.055977 .
## SeasonSpring 9.812e-02 3.226e-02 3.042 0.002416 **
## SeasonSummer 5.855e-02 3.002e-02 1.950 0.051404 .
## SeasonAutumn 3.963e-02 3.205e-02 1.236 0.216571
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3142 on 982 degrees of freedom
## Multiple R-squared: 0.1649, Adjusted R-squared: 0.1564
## F-statistic: 19.39 on 10 and 982 DF, p-value: < 2.2e-16
# Check for influential points using Cook's distance
cooks_d_log <- cooks.distance(season_model_log)
# Visualize Cook's Distance
cooks_data_log <- data.frame(Observation = 1:length(cooks_d_log), Cooks_Distance = cooks_d_log)
ggplot(cooks_data_log, aes(x = Observation, y = Cooks_Distance)) +
geom_point() +
geom_hline(yintercept = 4 / length(cooks_d_log), color = "red", linetype = "dashed") +
labs(title = "Cook's Distance for Influential Observations",
x = "Observation",
y = "Cook's Distance") +
theme_minimal()
# Identify observations with Cook's distance greater than 4/n
influentials_log <- which(cooks_d_log > (4 / nrow(trashwheel_cleaned)))
influentials_log
## 20 26 30 31 34 50 94 117 272 407 631 632 633 638 639 642 644 645 647 648
## 20 26 30 31 34 50 94 117 272 407 631 632 633 638 639 642 644 645 647 648
## 651 657 658 662 666 673 680 687 696 697 709 716 744 749 750 754 755 756 758 759
## 651 657 658 662 666 673 680 687 696 697 709 716 744 749 750 754 755 756 758 759
## 761 765 766 767 768 769 770 773 774 830 952
## 761 765 766 767 768 769 770 773 774 830 952
# Calculate residuals for the log model
residuals_model_log <- residuals(season_model_log)
# Plot histogram
ggplot(data.frame(residuals = residuals_model_log), aes(x = residuals)) +
geom_histogram(binwidth = 0.5, color = "black", fill = "blue", alpha = 0.7) +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
theme_minimal()
# Get the fitted values for the log model
fitted_values_log <- fitted(season_model_log)
# Create a data frame for ggplot
residuals_vs_fitted_df_log <- data.frame(fitted_values_log, residuals = residuals(season_model_log))
# Residuals vs Fitted plot
ggplot(residuals_vs_fitted_df_log, aes(x = fitted_values_log, y = residuals)) +
geom_point(color = "skyblue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "Residuals vs Fitted Values-ordinal", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
plot(season_model_log, which = 3)
# Breusch-Pagan Test for homoscedasticity for the log model
bptest(season_model_log)
##
## studentized Breusch-Pagan test
##
## data: season_model_log
## BP = 15.6, df = 10, p-value = 0.1117
# Calculate and check the VIF for the log model
vif_values_log <- vif(season_model_log)
# Print VIF values
print(vif_values_log)
## GVIF Df GVIF^(1/(2*Df))
## PlasticBottles 2.105943 1 1.451187
## Polystyrene 3.031032 1 1.740986
## CigaretteButts 2.423324 1 1.556703
## GlassBottles 2.265123 1 1.505033
## PlasticBags 2.367857 1 1.538784
## Wrappers 2.519402 1 1.587262
## SportsBalls 1.294067 1 1.137571
## Season 1.096907 3 1.015535
# the modfied version of trash_model
final_trash <- lm(log_Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
GlassBottles + PlasticBags + Wrappers + SportsBalls,
data = trashwheel)
# Modified version of season_model
final_season<- lm(log_Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
GlassBottles + PlasticBags + Wrappers + SportsBalls + Season,
data = trashwheel)
# Compare AIC values of the two models
AIC(final_trash, final_season)
## df AIC
## final_trash 9 535.3397
## final_season 12 531.4962
# Compare BIC values of the two models
BIC(final_trash, final_season)
## df BIC
## final_trash 9 579.4463
## final_season 12 590.3050
summary(final_trash)
##
## Call:
## lm(formula = log_Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
## GlassBottles + PlasticBags + Wrappers + SportsBalls, data = trashwheel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.53493 -0.13867 0.05796 0.20876 0.62204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.133e+00 2.490e-02 45.485 < 2e-16 ***
## PlasticBottles -1.963e-05 8.777e-06 -2.237 0.02551 *
## Polystyrene -2.926e-05 9.432e-06 -3.102 0.00198 **
## CigaretteButts 3.090e-06 6.341e-07 4.873 1.28e-06 ***
## GlassBottles -1.765e-03 1.124e-03 -1.571 0.11661
## PlasticBags 4.400e-05 1.085e-05 4.054 5.42e-05 ***
## Wrappers -4.355e-05 6.317e-06 -6.895 9.62e-12 ***
## SportsBalls 2.846e-03 1.465e-03 1.943 0.05236 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3152 on 985 degrees of freedom
## Multiple R-squared: 0.1565, Adjusted R-squared: 0.1506
## F-statistic: 26.12 on 7 and 985 DF, p-value: < 2.2e-16
summary(final_season)
##
## Call:
## lm(formula = log_Weight ~ PlasticBottles + Polystyrene + CigaretteButts +
## GlassBottles + PlasticBags + Wrappers + SportsBalls + Season,
## data = trashwheel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48573 -0.13359 0.06409 0.20567 0.62707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.072e+00 3.446e-02 31.104 < 2e-16 ***
## PlasticBottles -2.097e-05 8.775e-06 -2.389 0.017066 *
## Polystyrene -2.815e-05 9.482e-06 -2.968 0.003067 **
## CigaretteButts 2.793e-06 6.460e-07 4.324 1.69e-05 ***
## GlassBottles -1.137e-03 1.138e-03 -0.999 0.318269
## PlasticBags 4.064e-05 1.087e-05 3.738 0.000196 ***
## Wrappers -4.219e-05 6.312e-06 -6.683 3.91e-11 ***
## SportsBalls 2.803e-03 1.465e-03 1.913 0.055977 .
## SeasonSpring 9.812e-02 3.226e-02 3.042 0.002416 **
## SeasonSummer 5.855e-02 3.002e-02 1.950 0.051404 .
## SeasonAutumn 3.963e-02 3.205e-02 1.236 0.216571
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3142 on 982 degrees of freedom
## Multiple R-squared: 0.1649, Adjusted R-squared: 0.1564
## F-statistic: 19.39 on 10 and 982 DF, p-value: < 2.2e-16
for the simple model (final_trash) predicting the log of weight based on different trash categories is as follows: log ( Weight ) = 1.133 − 0.00001963 × PlasticBottles − 0.00002926 × Polystyrene + 0.00000309 × CigaretteButts − 0.001765 × GlassBottles + 0.00004400 × PlasticBags − 0.00004355 × Wrappers + 0.002846 × SportsBalls
Two regression models were used to predict the log-transformed weight of trash: the simpler model (final_trash) and the more complex model (final_season) that includes an additional variable for seasons. Simple Model (final_trash): The simple model explained a small proportion of the variance in the log-transformed weight of trash, with an adjusted R-squared of 0.1506. The model had a significant F-statistic, 𝐹 ( 7 , 985 ) = 26.12 F(7,985)=26.12, 𝑝 < 2.2 𝑒 − 16 p<0.001, suggesting that the included variables were collectively significant predictors. Among the variables, PlasticBottles, Polystyrene, CigaretteButts, PlasticBags, Wrappers, and SportsBalls were significant predictors of log-transformed weight. GlassBottles, however, was not a significant predictor (p = 0.11661). Complex Model (final_season): The more complex model, which also includes the variable Season, accounted for a slightly larger proportion of variance in the log-transformed weight of trash, with an adjusted R-squared of 0.1564. The model’s overall fit was significant, with 𝐹 ( 10 , 982 ) = 19.39 F(10,982)=19.39, 𝑝 < 0,001. In this model, PlasticBottles, Polystyrene, CigaretteButts, PlasticBags, Wrappers, and SportsBalls remained significant, similar to the simple model. Additionally, SeasonSpring and SeasonSummer were found to be significant predictors, with SeasonAutumn showing no significant effect (p = 0.216571).
Although the complex model explains slightly more variance in the dependent variable (log-transformed weight of trash), with an adjusted R-squared increase from 0.1506 to 0.1564, the improvement is relatively small. This suggests that while seasonality may have a small effect on the weight of trash, the inclusion of season variables doesn’t significantly improve the model compared to the simpler one.
The simple model (final_trash) is preferred due to the simplicity and the fact that the addition of seasonal information in the complex model (final_season) does not substantially increase the explanatory power of the model. Removing the seasonality factor does not negatively impact the model’s predictive accuracy much, making the simpler model an effective choice in this case.